#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @FileName  :基于m4a_url直接下载.py
# @Time      :2024/4/25 
# @Author    :CL
# @email     :1037654919@qq.com
import os

import requests
from bs4 import BeautifulSoup

from util import mongo_manager
# Shared handle created through the project-local `mongo_manager` helper.
# NOTE(review): presumably this targets the `ximalaya_sound` collection in the
# `public_data` database -- confirm against `util.mongo_manager`.
ximalaya_sound = mongo_manager("ximalaya_sound", db="public_data")
# Browser-like request headers sent when fetching a sound's HTML page.
# Key order is kept as-is because it is the order the headers are sent in.
headers = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.7'
    ),
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
    ),
    'sec-ch-ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Linux"',
}
def download_file_via_m4a_url(url, path, name, timeout=30):
    """Download the audio file at ``url`` and save it as ``<path>/<name>.<ext>``.

    ``<ext>`` is the text after the last ``.`` in the final ``/``-separated
    segment of ``url``.

    Args:
        url: Direct URL of the audio file.
        path: Directory the file is written into; it must already exist.
        name: Target file name without extension.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        True when the server answered 200 and the body was written to disk,
        False for any other status code (nothing is written in that case).

    Raises:
        requests.RequestException: On connection errors or when the timeout
            expires.
        OSError: If the target file cannot be opened or written.
    """
    # Named so it does not shadow the module-level `headers` used for HTML pages.
    request_headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
    # Without a timeout a stalled connection would block the crawler forever.
    resp = requests.get(url, headers=request_headers, timeout=timeout)
    extension = url.split("/")[-1].split('.')[-1]
    file_path = os.path.join(path, name + '.' + extension)
    if resp.status_code != 200:
        # Report the failure instead of silently returning as if it worked.
        return False
    with open(file_path, "wb") as f:
        f.write(resp.content)
    return True

if __name__ == '__main__':
    # Work loop: take one pending seed at a time, download its audio and its
    # intro text, then record the result. Stops when no seed is returned.
    while True:
        # Select a seed that has an m4a_url and status 'failed', switching it
        # to 'running'.
        # NOTE(review): `find_one_and_update` / `updateOne` belong to the
        # project-local mongo_manager wrapper; the bare {"status": ...} update
        # document presumably gets wrapped in `$set` there -- confirm.
        seed = ximalaya_sound.find_one_and_update({"m4a_url": {'$exists':True},"status": 'failed'},
                                                  {"status": 'running'})
        if not seed:
            break
        print(seed)

        # Only direct .m4a links are handled; other seeds keep status 'running'.
        if not seed['m4a_url'].endswith('.m4a'):
            continue

        path = os.path.join( "/home/chenglei3/work/data/ximalaya/", seed['_type'], seed['h2'])
        os.makedirs(path, exist_ok=True)
        # '/' in a title would otherwise be interpreted as a directory separator.
        name = str(seed['title']).replace('/','_')

        # Download the audio.
        download_file_via_m4a_url(seed['m4a_url'],path,name)
        # Confirm the audio actually landed on disk before recording success:
        # a non-200 response writes nothing. The URL ends with '.m4a', so that
        # is the extension the download helper derives for the file name.
        audio_path = os.path.join(path, name + '.m4a')
        if not os.path.isfile(audio_path):
            # Leave the seed in 'running'; setting it back to 'failed' would
            # make this loop select it again immediately and never terminate.
            print(f"{name} download failed: audio file was not written")
            continue

        # Download the text: the seed's _id is used as the URL of the sound page.
        sound_url = seed["_id"]
        res = requests.get(sound_url, headers=headers, timeout=10)
        soup = BeautifulSoup(res.text, 'lxml')
        # Extract the audio intro. `find` returns None when the element is
        # absent, which previously crashed the whole batch with AttributeError.
        intro = soup.find('article', class_='intro')
        if intro is None:
            print(f"{name} intro not found at {sound_url}")
            continue
        article = intro.text
        with open(f'{path}/{name}.txt', 'a', encoding='utf-8') as f:
            f.write(article + '\n')

        print(f"{name} download success")
        seed['status'] = 'done'
        seed['status_text'] = 'done'
        ximalaya_sound.updateOne({"_id": seed['_id']}, seed)